********************************************************************************
* COMPLETE DATA PREPARATION WITH ALL MERGES
* Including coal plant, China air quality, and China media data
********************************************************************************

clear all
set more off

** SELECT WORKING DIRECTORIES **
*global dir  "" // enter your working directory here
global dir "C:\Users\ev42\Dropbox\Korea\Replication"  
global working "$dir\working"
global data "$dir\Data"
global output "$dir\Output"

cd "$data"


********************************************************************************
* PART 1: PREPARE COAL PLANT DATA
********************************************************************************

preserve
* Import coal plant data
import excel "230712 coal_plant.xlsx", clear firstrow allstring

* The first row contains Korean headers, drop it
drop in 1

* Drop rows with missing plant names
drop if 발전소 == "" | missing(발전소)

* Drop the unnamed columns (columns 5 and 6 which are subdivision info)
drop E F  // These correspond to Unnamed: 4 and Unnamed: 5

* Rename using alphabetical column names that Stata assigns
* Column G-I = 2007 (kw, units, total), J-L = 2008, etc.
rename G kw_2007
rename H units_2007  
rename I total_2007

rename J kw_2008
rename K units_2008
rename L total_2008

rename M kw_2009
rename N units_2009
rename O total_2009

rename P kw_2010
rename Q units_2010
rename R total_2010

rename S kw_2011
rename T units_2011
rename U total_2011

rename V kw_2012
rename W units_2012
rename X total_2012

rename Y kw_2013
rename Z units_2013
rename AA total_2013

rename AB kw_2014
rename AC units_2014
rename AD total_2014

rename AE kw_2015
rename AF units_2015
rename AG total_2015

rename AH kw_2016
rename AI units_2016
rename AJ total_2016

rename AK kw_2017
rename AL units_2017
rename AM total_2017

rename AN kw_2018
rename AO units_2018
rename AP total_2018

rename AQ kw_2019
rename AR units_2019
rename AS total_2019

rename AT kw_2020
rename AU units_2020
rename AV total_2020

rename AW kw_2021
rename AX units_2021
rename AY total_2021

* Rename Korean column names
rename 위치 region

* Generate unique plant ID
gen plant_id = _n

* Reshape to long format
reshape long kw_ units_ total_, i(plant_id 발전종류 연료 발전소 region) j(year)

* Convert string to numeric
destring kw_ units_ total_, replace force

* Replace missing and negative values with 0
foreach var of varlist kw_ units_ total_ {
    replace `var' = 0 if `var' == . | `var' < 0
}

* Rename for clarity
rename total_ kw_capacity
rename units_ plant_count
rename kw_ kw_unit

* Collapse to region-year level
collapse (sum) kw_sum=kw_capacity plant_sum=plant_count, by(region year)

* Create the main variable
gen kw_plant = kw_sum

* Map regions to REGION_KOR numeric codes
gen REGION_KOR = .
replace REGION_KOR = 1 if strpos(region, "서울") > 0
replace REGION_KOR = 2 if strpos(region, "부산") > 0  
replace REGION_KOR = 3 if strpos(region, "대구") > 0
replace REGION_KOR = 4 if strpos(region, "인천") > 0
replace REGION_KOR = 5 if strpos(region, "광주") > 0
replace REGION_KOR = 6 if strpos(region, "대전") > 0
replace REGION_KOR = 7 if strpos(region, "울산") > 0
replace REGION_KOR = 8 if strpos(region, "경기") > 0
replace REGION_KOR = 9 if strpos(region, "강원") > 0
replace REGION_KOR = 10 if strpos(region, "충북") > 0 | strpos(region, "충청북") > 0
replace REGION_KOR = 11 if strpos(region, "충남") > 0 | strpos(region, "충청남") > 0
replace REGION_KOR = 12 if strpos(region, "전북") > 0 | strpos(region, "전라북") > 0
replace REGION_KOR = 13 if strpos(region, "전남") > 0 | strpos(region, "전라남") > 0
replace REGION_KOR = 14 if strpos(region, "경북") > 0 | strpos(region, "경상북") > 0
replace REGION_KOR = 15 if strpos(region, "경남") > 0 | strpos(region, "경상남") > 0
replace REGION_KOR = 16 if strpos(region, "제주") > 0

* Keep only matched regions  
drop if missing(REGION_KOR)

* Create log transformation
gen ln_kw_plant = log(1 + kw_plant)

* Keep relevant variables
keep REGION_KOR year kw_sum plant_sum kw_plant ln_kw_plant

* Save for merging
save "coal_plant_temp.dta", replace
restore

********************************************************************************
* PART 2: PREPARE CHINA AIR QUALITY DATA
********************************************************************************

preserve
import excel "230824 china_region_air_quality.xlsx", clear firstrow

* Collapse to date level (average across regions)
collapse (mean) china_pm25=pm25, by(date)

* Drop missing
drop if missing(date) | missing(china_pm25)

* Ensure date is in Stata format
capture confirm numeric variable date
if _rc {
    gen date_temp = date(date, "YMD")  // Adjust format as needed
    drop date
    rename date_temp date
}
format date %td
rename date WP4

save "china_air_temp.dta", replace
restore

********************************************************************************
* PART 3: PREPARE CHINA MEDIA DATA
********************************************************************************

preserve
import delimited "media.csv", clear

* Convert date (M/D/Y format)
gen WP4 = date(date, "MDY")
format WP4 %td

* Rename count variable
rename count chn_media_count

* Keep only needed variables
keep WP4 chn_media_count

save "china_media_temp.dta", replace
restore

********************************************************************************
* PART 4: MAIN DATA PREPARATION AND INITIAL MERGES
********************************************************************************

* Open raw Gallup data
use "Gallup_WP.dta", clear 

* Merge with air quality data
merge m:1 WP4 REGION_KOR using "AirqualityNew.dta"
drop if _merge==2
drop _merge
  
* Insert rain and temperature data
merge m:1 WP4 REGION_KOR using "RainandTemp.dta"
drop if _merge==2
drop _merge

destring temp, replace force
destring rain, replace force

********************************************************************************
* PART 5: CREATE TIME VARIABLES AND MERGE NEW DATASETS
********************************************************************************

* Create year variable for merging
gen year = year(WP4)
gen month = month(WP4)
gen dayofweek = dow(WP4)

* Merge coal plant data
merge m:1 REGION_KOR year using "coal_plant_temp.dta", ///
    keepusing(kw_sum plant_sum kw_plant ln_kw_plant)
    
* Handle non-matches for coal plant data
replace kw_plant = 0 if _merge == 1 | missing(kw_plant)
replace kw_sum = 0 if _merge == 1 | missing(kw_sum)
replace plant_sum = 0 if _merge == 1 | missing(plant_sum)
replace ln_kw_plant = 0 if _merge == 1 | missing(ln_kw_plant)
drop if _merge == 2
drop _merge

* Merge China air quality data
merge m:1 WP4 using "china_air_temp.dta", keepusing(china_pm25)
drop if _merge == 2
drop _merge

* Merge China media data
merge m:1 WP4 using "china_media_temp.dta", keepusing(chn_media_count)
replace chn_media_count = 0 if _merge == 1 | missing(chn_media_count)
drop if _merge == 2
drop _merge

********************************************************************************
* PART 6: CREATE ALL ANALYSIS VARIABLES
********************************************************************************

* No meteorological data from this source before 2014
keep if year > 2014

* Satisfaction with Quality of Air
gen air = 99
replace air = 0 if WP94 == 1
replace air = 1 if WP94 == 2
replace air = . if air == 99
 
* Dissatisfied with govt efforts to preserve environment
gen environment = .
replace environment = 0 if WP132 == 1
replace environment = .5 if WP132 == 3
replace environment = 1 if WP132 == 2
 
* Do you have health problems
gen health = .
replace health = 0 if WP23 == 1
replace health = 1 if WP23 == 2
 
* 4 point global warming threat question. Only asked in 1 year. Very little variation
replace WP4652 = . if WP4652 == 5
 
* Generate an indicator for whether the region is in the Seoul area
gen Seoul = 0
replace Seoul = 1 if REGION_KOR == 1 | REGION_KOR == 4 | REGION_KOR == 8
 
* Wind Direction check - Create quadrants
gen Compass = 0
replace Compass = 1 if lagWindDirection > -1 & lagWindDirection < 91
replace Compass = 2 if lagWindDirection > 90 & lagWindDirection < 181
replace Compass = 3 if lagWindDirection > 180 & lagWindDirection < 270
replace Compass = 4 if lagWindDirection > 269 & lagWindDirection < 361
replace Compass = . if Compass == 0

label define wind 1 "NE" 2 "SE" 3 "SW" 4 "NW" 
label values Compass wind  
 
* Leadership of countries questions - Code DK as in the middle
gen china = .
replace china = 0 if WP156 == 1
replace china = 1 if WP156 == 2
replace china = .5 if WP156 == 3
   
gen rok = .
replace rok = 0 if WP150 == 1
replace rok = 1 if WP150 == 2
replace rok = .5 if WP150 == 3
    
gen japan = .
replace japan = 0 if WP157 == 1
replace japan = 1 if WP157 == 2
replace japan = .5 if WP157 == 3
	
gen russia = .
replace russia = 0 if WP155 == 1
replace russia = 1 if WP155 == 2
replace russia = .5 if WP155 == 3
	
gen usa = .
replace usa = 0 if WP151 == 1
replace usa = 1 if WP151 == 2
replace usa = .5 if WP151 == 3
   
gen fra = .
replace fra = 0 if WP154 == 1
replace fra = 1 if WP154 == 2
replace fra = .5 if WP154 == 3
  
* This is a question about confidence in national government
gen natgov = .
replace natgov = 0 if WP139 == 1
replace natgov = 1 if WP139 == 2
replace natgov = .5 if WP139 == 3

* Approval of President
gen presap = .
replace presap = 0 if WP13125 == 1
replace presap = 1 if WP13125 == 2
replace presap = .5 if WP13125 == 3
     
gen water = .
replace water = 0 if WP95 == 1
replace water = 1 if WP95 == 2
replace water = .5 if WP95 == 3
	 
gen city = .
replace city = 0 if WP83 == 1
replace city = 1 if WP83 == 2
replace city = .5 if WP83 == 3
  
gen move = .
replace move = 0 if WP85 == 1
replace move = 1 if WP85 == 2
replace move = .5 if WP85 == 3
   
* Demographics
gen Female = WP1219
gen Age = WP1220
replace Age = . if WP1220 == 100

* Education
replace WP3117 = . if WP3117 > 3

* PM25 fine dust measures - Yellow code warning indicators
gen yellow = 0
replace yellow = 1 if pm25 > 100 & pm25 != .
replace yellow = 1 if lagpm25 > 100 & lagpm25 != .

gen yellowaqi = 0
replace yellowaqi = 1 if lagaqi > 100 & lagaqi != .
 
gen orange = 0
replace orange = 1 if pm25 > 100 & pm25 != .
replace orange = 1 if lagpm25 > 100 & lagpm25 != .

gen orangeaqi = 0
replace orangeaqi = 1 if lagaqi > 100 & lagaqi != .
 
gen red = 0
replace red = 1 if pm25 > 100 & pm25 != .
replace red = 1 if lagpm25 > 100 & lagpm25 != .
 
gen redaqi = 0
replace redaqi = 1 if aqi > 150 & pm25 != .
replace redaqi = 1 if lagaqi > 150 & lagaqi != .
 
gen maxaqi = max(aqi, lagaqi)
gen maxpm25 = max(pm25, lagpm25)
 
* Cluster variable
gen regyear = year*100 + REGION_KOR

*Severe Covid 
gen covid = (year == 2022)
 
* Political period indicators
gen left = 0
replace left = 1 if year == 2018 | year == 2019 | year == 2020
  
gen left_d = -1
replace left_d = 1 if year == 2018 | year == 2019 | year == 2020
 
* Clean region variable
replace REGION_KOR = . if REGION_KOR > 16
  
* Recode ideology items
recode WP128 (1=1) (2=0) (3=.5) (4=.5), gen(item1)
recode WP129 (1=1) (2=0) (3=.5) (4=.5), gen(item2)
recode WP130 (1=1) (2=0) (3=.5) (4=.5), gen(item3)
recode WP131 (1=1) (2=0) (3=.5) (4=.5), gen(item4)
recode WP9050 (1=1) (2=0) (3=.5) (4=.5), gen(item5)
recode WP119 (1=1) (2=0) (3=.5) (4=.5), gen(item6)

gen presideo = presap*left_d
reg presideo item1 item2 item3 item4 item5 item6 i.year 

alpha item1 item2 item3 item4 item5 item6, std item detail

gen ideology = (item2 + item3 + item4 + item5)/4

reg presap c.ideology##left Female Age WP3117 INCOME_2 i.year

********************************************************************************
* PART 7: VARIABLE LABELS
********************************************************************************

label variable lagpm25 "AQI pm2.5 (lag)"
label variable lagpm25pr "AQI pm2.5 (lag), predicted from wind"
label variable lagaqi "Composite AQI (lag)"
label variable yellowaqi "Yellow Warning"
label variable orangeaqi "Orange Warning"
label variable redaqi "Red Warning"
label variable air "Dissatisfied w. Air"
label variable pm25 "AQI, pm2.5"
label variable china "Disapprove China Leadership"
label variable rok "Disapprove ROK Leadership"
label variable usa "Disapprove USA Leadership"
label variable natgov "No Confidence National Government"
label variable environment "Dissatisfied Govt Preserve Environment"
label variable WP3117 "Education"
label variable INCOME_2 "Household Income"
label variable presap "Presidential Approval"
label variable ideology "Ideology"

* New variable labels
label variable kw_plant "Coal Plant Capacity"
label variable ln_kw_plant "Log(Coal Plant Capacity)"
label variable china_pm25 "China Average PM2.5"
label variable chn_media_count "China Media Mentions"

label variable air "Dissatisfied w. Air"
label variable ideology "Ideology"
label variable left "left"
label variable Age "Age"
label variable INCOME_2 "Household Income"

/* For i.Female (assuming 1=Female and 0=Male is the base) */
capture label drop female_lbl
label define female_lbl 1 "Male" 2 "Female"
label values Female female_lbl

/* For i.WP3117 (assuming 1=base, 2=Lev.2, 3=Lev.3) */
label define educ_label 2 "Education Lev.2" 3 "Education Lev.3"
label values WP3117 educ_label

********************************************************************************
* PART 8: FINAL DATA TRANSFORMATIONS
********************************************************************************

* Change the scale of the AQI variables
replace lagpm25 = lagpm25/100
replace lagaqi = lagaqi/100
replace lagpm25Ganghwa = lagpm25Ganghwa/100
  
* Create squared terms
gen lagaqisq = lagaqi*lagaqi
gen lagpm25sq = lagpm25*lagpm25

********************************************************************************
* PART 9: CLEAN UP AND SAVE
********************************************************************************

* Clean up temporary files
capture erase "coal_plant_temp.dta"
capture erase "china_air_temp.dta"
capture erase "china_media_temp.dta"

* Save final dataset
save "RegressiondataIO.dta", replace
